{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Data Deployment.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EROAwrOzmR_P",
        "colab_type": "text"
      },
      "source": [
        "![人工智慧 - 自由團隊](https://raw.githubusercontent.com/chenkenanalytic/img/master/af/aifreeteam.png)\n",
        "\n",
        "\n",
        "<center>Welcome to the data deployment practice of Traditional Chinese Handwriting Dataset by AI . FREE Team.</center>\n",
        "<br>\n",
        "<center>歡迎大家來到 AI . FREE Team 所開發的繁體中文手寫資料部署實作。 </center>\n",
        "<br>\n",
        "\n",
        "<center>(Author: Chen Ken, Yen-Lin 博士；Date of published: 2020/4/21；AI . FREE Team Website: https://aifreeblog.herokuapp.com/)</center>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kNI33w0Fbakh",
        "colab_type": "code",
        "outputId": "3a3636e8-1f47-41f8-ab70-c22b62058f3a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 123
        }
      },
      "source": [
        "!git clone https://github.com/AI-FREE-Team/Traditional-Chinese-Handwriting-Dataset.git"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cloning into 'Traditional-Chinese-Handwriting-Dataset'...\n",
            "remote: Enumerating objects: 38, done.\u001b[K\n",
            "remote: Counting objects: 100% (38/38), done.\u001b[K\n",
            "remote: Compressing objects: 100% (37/37), done.\u001b[K\n",
            "remote: Total 56 (delta 13), reused 11 (delta 1), pack-reused 18\u001b[K\n",
            "Unpacking objects: 100% (56/56), done.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-69nnOJpcLZt",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import os\n",
        "import zipfile\n",
        "import shutil\n",
        "\n",
        "OutputFolder = '/content/Handwritten_Data'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gDA6SQQobuEG",
        "colab_type": "code",
        "outputId": "931f11d9-5674-4fc1-9e7c-fee3b2a166a4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "if not os.path.exists(OutputFolder):\n",
        "  os.mkdir(OutputFolder)\n",
        "  print( f'Create the new \"{OutputFolder}\" folder' )\n",
        "\n",
        "os.chdir(OutputFolder)\n",
        "\n",
        "### 檢查路徑\n",
        "!pwd"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Create the new \"/content/Handwritten_Data\" folder\n",
            "/content/Handwritten_Data\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GcO5DNn_cqpR",
        "colab_type": "code",
        "outputId": "1ccd40ab-89ae-40d7-e913-8a94c7a16047",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 123
        }
      },
      "source": [
        "CompressedFiles = []\n",
        "\n",
        "os.chdir('/content/Traditional-Chinese-Handwriting-Dataset/data')\n",
        "\n",
        "for item in os.listdir():  \n",
        "  if item.endswith('.zip'): # Check for \".zip\" extension.\n",
        "    file_path = os.path.abspath(item) # Get full path of the compressed file. \n",
        "    CompressedFiles.append(file_path)\n",
        "\n",
        "for file in CompressedFiles:     \n",
        "  # Construct a ZipFile object with the filename, and then extract it.\n",
        "  zip_ref = zipfile.ZipFile(file).extractall(OutputFolder) \n",
        "  \n",
        "  source_path = OutputFolder + '/cleaned_data(50_50)'\n",
        "  img_list = os.listdir(source_path)\n",
        "\n",
        "  for img in img_list:\n",
        "      shutil.move(source_path + '/' + img, OutputFolder) # Move a file to another location. \n",
        "  \n",
        "  shutil.rmtree(OutputFolder + '/cleaned_data(50_50)') \n",
        "  print(f'Decompress successfully {file} ......')\n",
        "\n",
        "print( 'Moving images according to traditional Chinese characters......' )\n",
        "\n",
        "ImageList = os.listdir(OutputFolder)\n",
        "ImageList = [img for img in ImageList if len(img)>1]\n",
        "WordList = list(set([w.split('_')[0] for w in ImageList]))\n",
        "\n",
        "for w in WordList:\n",
        "  try:\n",
        "    os.chdir(OutputFolder) # Change the current working directory to OutputPath.\n",
        "    os.mkdir(w) # Create the new word folder in OutputPath.\n",
        "    MoveList = [img for img in ImageList if w in img]\n",
        "                \n",
        "  except: \n",
        "    os.chdir(OutputFolder)\n",
        "    MoveList = [ img for img in ImageList if w in img ]\n",
        "  \n",
        "  finally:            \n",
        "    for img in MoveList:\n",
        "      old_path = OutputFolder + '/' + img\n",
        "      new_path = OutputFolder + '/' + w + '/' + img\n",
        "      shutil.move( old_path, new_path )\n",
        "\n",
        "print( 'Data Deployment completed.' )"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Decompress successfully /content/Traditional-Chinese-Handwriting-Dataset/data/cleaned_data(50_50)-20200420T071507Z-004.zip ......\n",
            "Decompress successfully /content/Traditional-Chinese-Handwriting-Dataset/data/cleaned_data(50_50)-20200420T071507Z-001.zip ......\n",
            "Decompress successfully /content/Traditional-Chinese-Handwriting-Dataset/data/cleaned_data(50_50)-20200420T071507Z-003.zip ......\n",
            "Decompress successfully /content/Traditional-Chinese-Handwriting-Dataset/data/cleaned_data(50_50)-20200420T071507Z-002.zip ......\n",
            "Moving images according to traditional Chinese characters......\n",
            "Data Deployment completed.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BtJidZSSed2C",
        "colab_type": "code",
        "outputId": "e74675ca-99ae-4de4-aef1-d471f9c05b3c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "a=0\n",
        "b=0\n",
        "\n",
        "for item in os.listdir(OutputFolder):\n",
        "  a += 1\n",
        "  for i in os.listdir(OutputFolder + '/' + item):\n",
        "    b +=1\n",
        "\n",
        "\n",
        "print('總共: ' + str(a) + ' 個字(資料夾) / 總共: ' + str(b) + '個樣本')\n",
        "print('平均每個字有: ' + str(b/a) + ' 個樣本')"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "總共: 4803 個字(資料夾) / 總共: 250712個樣本\n",
            "平均每個字有: 52.19904226525089 個樣本\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-HlCG5i4kDnS",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}